library(tidyverse)
library(scales)
library(ggrepel)
# Use the light ggplot2 theme for every plot drawn below.
theme_set(theme_light())

# Load the TidyTuesday (2018-10-16) recent-graduates data set from GitHub.
recent_grads <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-16/recent-grads.csv"
)
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
Major = col_character(),
Major_category = col_character()
)
ℹ Use `spec()` for the full column specifications.
# Sort the majors from highest to lowest median salary, title-case their
# names and store them as a factor whose levels are ordered by Median.
majors_processed <- recent_grads %>%
  arrange(desc(Median)) %>%
  mutate(Major = fct_reorder(str_to_title(Major), Median))
# Category of majors ----
# Horizontal box plots of median salary per major category. Categories are
# ordered by their median salary, the salary axis starts at 0 and is shown
# in dollars, and the fill legend is hidden.
majors_processed %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(mapping = aes(x = Major_category, y = Median, fill = Major_category)) +
  geom_boxplot() +
  coord_flip() +
  expand_limits(y = 0) +
  scale_y_continuous(labels = dollar_format()) +
  theme(legend.position = "none")
# Highest earning majors ----
# Rebuild majors_processed with the salary-related columns first.
# `everything()` keeps all remaining columns (Total, Men, Women, ShareWomen,
# ...): the plots and models further down still read them, so they must not
# be dropped here.
majors_processed <- recent_grads %>%
  arrange(desc(Median)) %>%
  select(
    Major, Major_category, Median, P25th, P75th, Sample_size,
    everything()
  ) %>%
  mutate(Major = str_to_title(Major), Major = fct_reorder(Major, Median))
# Top 20 majors by median salary among those with at least 100 respondents.
# head() picks the top earners because majors_processed is already sorted by
# descending Median. Error bars span the 25th to 75th salary percentile.
majors_processed %>%
  filter(Sample_size >= 100) %>%
  head(n = 20) %>%
  ggplot(aes(x = Major, y = Median, colour = Major_category)) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  coord_flip() +
  labs(
    title = "Highest earning Majors?",
    subtitle = "Top 20 Majors with at least 100 students surveyed, bars are 25th and 75th percentile",
    x = "",
    y = "Median Salary of Graduates"
  )
# Lowest earning majors ----
# The last 20 rows of majors_processed (sorted by descending Median), with the
# Major factor rebuilt from those rows only. No sample-size filter is applied
# here. Error bars span the 25th to 75th salary percentile.
majors_processed %>%
  tail(n = 20) %>%
  mutate(Major = fct_reorder(str_to_title(Major), Median)) %>%
  ggplot(aes(x = Major, y = Median, colour = Major_category)) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  coord_flip()
# Median salary against survey sample size (log10 x axis), with each point
# labelled by its major.
majors_processed %>%
  ggplot(aes(Sample_size, Median)) +
  geom_point() +
  # check_overlap, vjust and hjust are fixed settings rather than data
  # mappings, so they are passed to geom_text() directly. Inside aes(),
  # check_overlap is not a known aesthetic and is ignored with a warning.
  geom_text(aes(label = Major), check_overlap = TRUE, vjust = 1, hjust = 1) +
  scale_x_log10()
Ignoring unknown aesthetics: check_overlap
# Bar chart of the 20 majors with the most graduates (Total), largest on top.
majors_processed %>%
  # Reorder Major itself instead of overwriting Major_category with major
  # names; the bars, their order and their colours are unchanged.
  mutate(Major = fct_reorder(Major, Total)) %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  ggplot(aes(Major, Total, fill = Major)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = "Total Graduates #"
  ) +
  scale_y_continuous(labels = comma_format()) +
  # Each bar has its own colour, so a legend would only repeat the axis labels.
  theme(legend.position = "none")
# Gender breakdown of the 20 majors with the most graduates: one stacked bar
# per major, split into Men and Women.
majors_processed %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Total)) %>%
  # pivot_longer() replaces the superseded gather(): the Men and Women columns
  # become Gender (name) / Number (count) pairs, one row per major and gender.
  pivot_longer(c(Men, Women), names_to = "Gender", values_to = "Number") %>%
  select(Major, Gender, Number) %>%
  ggplot(aes(Major, Number, fill = Gender)) +
  geom_col() +
  coord_flip()
# Aggregate to one row per major category: head counts are summed and the
# median salary is averaged across majors, weighted by survey sample size.
# Rows with a missing Total are dropped first so the sums are not NA.
# The result is sorted from the highest to the lowest share of women.
by_major_category <- majors_processed %>%
  drop_na(Total) %>%
  group_by(Major_category) %>%
  summarise(
    Men = sum(Men),
    Women = sum(Women),
    Total = sum(Total),
    MedianSalary = sum(Median * Sample_size) / sum(Sample_size)
  ) %>%
  mutate(ShareWomen = Women / Total) %>%
  arrange(desc(ShareWomen))
`summarise()` ungrouping output (override with `.groups` argument)
library(plotly)
# Share of women against sample-size-weighted median salary, one point per
# major category, with a single linear trend line; rendered interactively
# through plotly.
g <- by_major_category %>%
  ggplot(aes(ShareWomen, MedianSalary)) +
  # Colour is mapped on the points only. As a plot-wide aesthetic it would
  # also group geom_smooth() by category, and each category is a single row,
  # so no trend line across categories could be fitted.
  geom_point(aes(color = Major_category)) +
  geom_smooth(method = "lm") +
  # `labels` spelled out in full instead of relying on partial matching.
  scale_x_continuous(labels = percent_format()) +
  scale_y_continuous(labels = dollar_format()) +
  expand_limits(y = 0)
ggplotly(g)
`geom_smooth()` using formula 'y ~ x'
# Weighted linear regression of median salary on the share of women, with one
# observation per major weighted by its survey sample size; summary() prints
# the coefficient table and fit statistics.
majors_processed %>%
# Keep only the columns relevant to the model.
select(Major, Total, ShareWomen, Sample_size, Median) %>%
# `data = .` routes the piped data frame into lm()'s data argument.
lm(Median ~ ShareWomen, data = ., weights = Sample_size) %>%
summary()
Call:
lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)
Weighted Residuals:
Min 1Q Median 3Q Max
-260500 -61042 -13899 33262 865081
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 52073 1436 36.255 <2e-16 ***
ShareWomen -23650 2403 -9.842 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 123000 on 170 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.363, Adjusted R-squared: 0.3592
F-statistic: 96.87 on 1 and 170 DF, p-value: < 2.2e-16
library(broom)
# Fit the weighted Median ~ ShareWomen regression separately within each major
# category that has at least 10 majors, keep the ShareWomen slope of every
# model, and add a false-discovery-rate adjusted p-value.
majors_processed %>%
  select(Major, Major_category, Total, ShareWomen, Sample_size, Median) %>%
  add_count(Major_category) %>%
  filter(n >= 10) %>%
  # The list-column must be named explicitly; the unnamed form
  # nest(-Major_category) is deprecated and triggers a warning.
  nest(data = -Major_category) %>%
  mutate(
    model = map(
      data,
      ~ lm(Median ~ ShareWomen, data = .x, weights = Sample_size)
    ),
    tidied = map(model, tidy)
  ) %>%
  unnest(tidied) %>%
  filter(term == "ShareWomen") %>%
  arrange(estimate) %>%
  mutate(fdr = p.adjust(p.value, method = "fdr"))
All elements of `...` must be named.
Did you want `data = c(Major, Total, ShareWomen, Sample_size, Median, n)`?